# Attach the plotting/reshaping packages used throughout this script.
# library() is used instead of require(): require() only returns FALSE (with a
# warning) when a package is missing, which lets the script carry on and fail
# later at an unrelated call; library() stops immediately with a clear error.
library(GGally, quietly = TRUE)
library(reshape2, quietly = TRUE)
library(tidyverse, quietly = TRUE, warn.conflicts = FALSE)
package ‘tidyverse’ was built under R version 3.3.2
package ‘ggplot2’ was built under R version 3.3.2
package ‘tibble’ was built under R version 3.3.2
package ‘tidyr’ was built under R version 3.3.2
library(ggfortify)
library(cluster)
library(ggdendro)
library(broom)
# Make ggplot2's black-and-white theme the default for every plot drawn below.
theme_set(theme_bw())
# Run the project-local helper script github-lib.R.
# NOTE(review): presumably this is where objects used later without a visible
# definition (e.g. dw, dw2, dw2.scaled) come from -- confirm.
source("github-lib.R")
# Scatterplot matrix of every column of dw except repository_language,
# which is dropped before the pairs plot is built.
ggpairs(select(dw, -repository_language))
plot: [1,1] [===---------------------------------------] 6% est: 0s
plot: [1,2] [=====-------------------------------------] 12% est: 1s
plot: [1,3] [========----------------------------------] 19% est: 1s
plot: [1,4] [==========--------------------------------] 25% est: 1s
plot: [2,1] [=============-----------------------------] 31% est: 1s
plot: [2,2] [================--------------------------] 38% est: 1s
plot: [2,3] [==================------------------------] 44% est: 1s
plot: [2,4] [=====================---------------------] 50% est: 1s
plot: [3,1] [========================------------------] 56% est: 1s
plot: [3,2] [==========================----------------] 62% est: 1s
plot: [3,3] [=============================-------------] 69% est: 0s
plot: [3,4] [================================----------] 75% est: 0s
plot: [4,1] [==================================--------] 81% est: 0s
plot: [4,2] [=====================================-----] 88% est: 0s
plot: [4,3] [=======================================---] 94% est: 0s
plot: [4,4] [==========================================]100% est: 0s
As variáveis são bastante assimétricas e concentradas em pequenos valores. Transformá-las para log ajuda na visualização.
# Same scatterplot matrix as before, this time on dw2.scaled; the
# repository_language column is again dropped first.
ggpairs(select(dw2.scaled, -repository_language))
plot: [1,1] [===---------------------------------------] 6% est: 0s
plot: [1,2] [=====-------------------------------------] 12% est: 1s
plot: [1,3] [========----------------------------------] 19% est: 1s
plot: [1,4] [==========--------------------------------] 25% est: 1s
plot: [2,1] [=============-----------------------------] 31% est: 1s
plot: [2,2] [================--------------------------] 38% est: 1s
plot: [2,3] [==================------------------------] 44% est: 1s
plot: [2,4] [=====================---------------------] 50% est: 1s
plot: [3,1] [========================------------------] 56% est: 1s
plot: [3,2] [==========================----------------] 62% est: 1s
plot: [3,3] [=============================-------------] 69% est: 0s
plot: [3,4] [================================----------] 75% est: 0s
plot: [4,1] [==================================--------] 81% est: 0s
plot: [4,2] [=====================================-----] 88% est: 0s
plot: [4,3] [=======================================---] 94% est: 0s
plot: [4,4] [==========================================]100% est: 0s
# Cut the hierarchical clustering tree hc into n_clusters groups and print the
# group number assigned to each observation.
cutree(hc, k = n_clusters)
ActionScript Ada Agda ANTLR Apex AppleScript Arc Arduino ASP
1 2 1 2 3 1 1 1 1
Assembly Augeas AutoHotkey AutoIt Awk BlitzBasic Brightscript C C#
4 3 1 1 1 4 4 4 4
C++ Ceylon Clojure COBOL CoffeeScript ColdFusion Common Lisp Coq Crystal
4 2 4 4 4 1 1 2 4
CSS D Dart DCPU-16 ASM Delphi DOT Dylan Ecl Eiffel
4 1 4 2 2 1 2 2 3
Elixir Elm Emacs Lisp Erlang F# Factor Fancy FORTRAN Go
4 1 1 1 4 3 2 1 4
Gosu Groovy Haskell Haxe HaXe IDL Idris Java JavaScript
1 1 1 4 2 4 1 4 4
Julia Kotlin LiveScript Logos Lua M Matlab Max Mirah
4 1 4 1 1 1 1 3 2
Monkey MoonScript Nemerle nesC NetLogo Nimrod Objective-C Objective-J OCaml
2 2 1 4 2 3 4 3 1
ooc Opa OpenEdge ABL Oxygene Parrot Pascal Perl Perl6 PHP
2 2 1 1 3 1 1 2 4
PogoScript PowerShell Processing Prolog Puppet Pure Data Python R Racket
2 1 1 1 1 2 4 1 1
REALbasic Rebol RobotFramework Ruby Rust Scala Scheme Scilab Self
2 2 3 1 4 4 1 2 1
Shell Slash Smalltalk Squirrel Standard ML SuperCollider Tcl TeX Turing
1 4 2 2 4 2 1 4 2
TypeScript UnrealScript Vala Verilog VHDL VimL Visual Basic wisp XC
4 2 4 1 2 1 1 3 2
XProc XQuery XSLT Xtend
2 1 4 1
library(plotly)
package ‘plotly’ was built under R version 3.3.2
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
# Parallel-coordinates view of dw2: one polyline per row, coloured by its
# `cluster` value, with one vertical axis per event column.
p <- dw2 %>%
  plot_ly(
    type = "parcoords",
    line = list(color = ~cluster),
    dimensions = list(
      # list(range = c(1, 4), label = "cluster", values = ~cluster),
      list(range = c(0, 4), label = "Forks/repo", values = ~ForkEvent),
      # The original set constraintrange = c(5, 6) on this axis. That brush
      # lies entirely outside the axis range of 0..4, so no line could fall
      # inside it and the whole plot started out de-emphasised. It has been
      # removed so all lines are visible; brush interactively to filter.
      list(range = c(0, 4), label = "Issues/repo", values = ~IssuesEvent),
      list(range = c(0, 4), label = "Pushes/repo", values = ~PushEvent),
      list(range = c(0, 4), label = "Watches/repo", values = ~WatchEvent)
    )
  )
p
# The clustering itself. k-means begins from randomly chosen centres, so the
# RNG is seeded first (same seed the k-exploration further down uses); without
# it the cluster numbering -- and every colour/facet derived from it -- can
# change from one run to the next.
set.seed(123)
km <- dw2.scaled %>%
  select(-repository_language) %>%
  kmeans(centers = n_clusters, nstart = 20)

# Long-format data frame for plotting: augment() appends the k-means
# assignment to dw2.scaled as a column named .cluster, then gather() moves
# every column except repository_language and .cluster into key/value pairs.
dw2.scaled.km.long <- km %>%
  augment(dw2.scaled) %>%
  gather(
    key = "variável",
    value = "valor",
    -repository_language, -.cluster
  )
# Profile plot: one line per repository_language across the gathered
# variables, coloured by k-means cluster and split into one panel per cluster.
ggplot(
  dw2.scaled.km.long,
  aes(x = `variável`, y = valor, group = repository_language, colour = .cluster)
) +
  # geom_point(alpha = 0.2) +
  geom_line(alpha = .5) +
  facet_wrap(~ .cluster)
# Labelled autoplot() of the k-means result over dw2.scaled.
autoplot(km, data = dw2.scaled, label = TRUE)

# Pairwise distances between rows (language label excluded); needed only for
# the silhouette plot that follows.
dists <- select(dw2.scaled, -repository_language) %>%
  dist()

# Silhouette plot of the k-means assignment, one Set2 colour per cluster.
plot(
  silhouette(km$cluster, dists),
  col = RColorBrewer::brewer.pal(n_clusters, "Set2")
)
Qual seria um bom valor de k? Uma medida comumente usada no k-means é comparar a distância (quadrática) entre o centro dos clusters e o centro dos dados com a distância (quadrática) entre todos os pontos dos dados e o centro dos dados. Aqui o centro dos dados é um ponto imaginário na média de todas as variáveis. Calculamos a distância do centro de cada cluster para o centro dos dados e multiplicamos pelo número de pontos nesse cluster. Somando esse valor para todos os clusters, temos betweenss abaixo. Se esse valor for próximo do somatório total das distâncias dos pontos para o centro dos dados (totss), os pontos estão próximos do centro de seu cluster. Essa proporção (betweenss / totss) pode ser usada para definir um bom valor de k: quando ela para de crescer, deixa de valer a pena aumentar k.
# Fit k-means for each k from 1 to 15 on the same columns (language label
# excluded) and keep glance()'s one-row model summary per k. The seed is fixed
# so the random starts repeat between runs.
set.seed(123)
explorando_k <- tibble(k = 1:15) %>%
  group_by(k) %>%
  do(glance(kmeans(
    select(dw2.scaled, -repository_language),
    centers = .$k,
    nstart = 20
  )))
# Ratio betweenss / totss as a function of k, drawn as a line with a point at
# each k.
ggplot(explorando_k, aes(x = k, y = betweenss / totss)) +
  geom_line() +
  geom_point()
O dataset ruspini é clássico para ilustrar agrupamento.
# Hierarchical clustering of rs: Euclidean distances, "ward.D" linkage.
dists <- dist(rs, method = "euclidean")
hc <- hclust(dists, method = "ward.D")

# Dendrogram, with boxes drawn around the 4-cluster cut.
plot(hc, hang = -1, cex = 0.8)
rect.hclust(hc, k = 4)

# Store the 4-cluster assignment on rs and colour the x/y scatter by it.
rs[["cluster"]] <- factor(cutree(hc, k = 4))
rs %>%
  ggplot(aes(x = x, y = y, colour = cluster)) +
  geom_point(size = 3)
# Overwrite the stored assignment with the 8-cluster cut and redraw the
# scatter, this time also printing each point's cluster number beside it.
rs[["cluster"]] <- factor(cutree(hc, k = 8))
rs %>%
  ggplot(aes(x = x, y = y, colour = cluster, label = cluster)) +
  geom_point(size = 2) +
  geom_text(hjust = -.1, vjust = 1) +
  xlim(0, 150)
# Silhouette plots for the hierarchical tree hc cut at 4 and at 6 clusters,
# both computed against the distance object dists.
plot(silhouette(cutree(hc, k = 4), dists))
plot(silhouette(cutree(hc, k = 6), dists))
# Disabled alternatives kept for reference: a heatmap of the first four
# columns of dw2, and a ggdendro rendering of the dendrogram.
#heatmap(as.matrix(dw2[,1:4]), Colv=F, scale='none')
#hc.data <- dendro_data(hc)
#ggdendrogram(hc.data, rotate = TRUE) +
#labs(title = "Agrupamento de Rustini")
# k-means (4 centres, 10 random starts) on the x/y coordinates only.
# By this point rs also carries the `cluster` factor written by the
# hierarchical-clustering steps above; passing rs whole would hand that label
# to kmeans() as if it were a feature, so only x and y are kept.
rs_xy <- rs[c("x", "y")]
km <- kmeans(rs_xy, centers = 4, nstart = 10)
km

# Plot the result against the same feature-only data, so that the pre-existing
# `cluster` column of rs cannot be confused with the k-means assignment.
autoplot(km, data = rs_xy)
autoplot(km, data = rs_xy, frame = TRUE)